library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:graphics':
##
## layout
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
library(tidyverse)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel)
library(broom)
library(dplyr)
theme_set(theme_minimal())
recent_grads <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-16/recent-grads.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Major = col_character(),
## Major_category = col_character()
## )
## See spec(...) for full column specifications.
# we didn't get this dataset first, we get this data by plot the raw data frist, and figure out how we what processed the data
majors_processed <- recent_grads %>%
arrange(desc(Median)) %>%
mutate(Major = str_to_title(Major),
Major = fct_reorder(Major, Median))
I’ll also be aggregating by cateogyr.
by_majoy_cateotry <- majors_processed %>%
dplyr::filter(!is.na(Total)) %>%
group_by(Major_category) %>%
dplyr::summarize(Men = sum(Men),
Women = sum(Women),
Total = sum(Total),
MedianSalary = sum(Median * Sample_size) / sum(Sample_size)) %>%
mutate(ShareWomen = Women / Total) %>%
arrange(desc(ShareWomen))
Most common Majors
What major categories (e.g. enginerring, psycholog, business) were most common?
by_majoy_cateotry %>%
mutate(Major_category = fct_reorder(Major_category, Total)) %>%
gather(Gender, Number, Men, Women) %>%
ggplot(aes(Major_category, Number, fill = Gender)) +
geom_col() +
scale_y_continuous(labels = comma_format()) +
coord_flip() +
labs(title = "What are the most common major category",
x = "",
y = "Total # of graduess")

what categories of majors make more money than others
recent_grads %>%
mutate(Major_category = fct_reorder(Major_category, Median)) %>%
ggplot(aes(Major_category, Median, fill = Major_category)) +
geom_boxplot() +
scale_y_continuous(labels = dollar_format()) +
expand_limits(y = 0) +
coord_flip() +
theme(legend.position = "none")

what are the highest earning majors?
majors_processed %>%
head(20) %>%
ggplot(aes(Major,Median, color = Major_category)) +
scale_y_continuous(labels = dollar_format()) +
geom_point() +
geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
coord_flip() +
expand_limits(y = 0)

majors_processed %>%
filter(Sample_size >= 100) %>%
head(20) %>%
ggplot(aes(Major,Median, color = Major_category)) +
scale_y_continuous(labels = dollar_format()) +
geom_point() +
geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
coord_flip() +
expand_limits(y = 0) +
labs(title = "What are the highest-earning majors",
subtitle = "Top 20 majors with at least 100 graduates survey. Bars represent the 25th to 75th percentage",
x = "",
y = "median salary graduates")

How does gender breakdown related to typical earnings?
majors_processed %>%
arrange(desc(Total)) %>%
head(20) %>%
gather(Gender, Number, Men, Women) %>%
mutate(Major = fct_reorder(Major, Total)) %>%
ggplot(aes(Major, Number, fill = Gender)) +
geom_col() +
coord_flip()

by_majoy_cateotry %>%
ggplot(aes(ShareWomen, MedianSalary)) +
geom_point() +
geom_text_repel(aes(label = Major_category), force = .2) +
geom_smooth(method = "lm") +
expand_limits(y = 0) +
scale_y_continuous(labels = dollar_format())

When plot the correlation, you probabily don’t want to aggregate first. If you do you will loose a lot of information about variation. Keep us safe from Simpson’s Paradox. What problem could rise if we aggreate data first. For example, we can see in last chart, the health is the pretty high in terms of median income and women percentage. What we also know nurse also pretty high in women percentage but pretty low in terms of income level. Correlation within each category that are overall different than overall correlation.
g <- majors_processed %>%
filter(!is.na(Total)) %>%
mutate(Major_category = fct_lump(Major_category, n = 4)) %>%
ggplot(aes(ShareWomen, Median, color = Major_category, size = Sample_size, label = Major)) +
geom_point() +
scale_x_continuous(labels = percent_format()) +
scale_y_continuous(labels = dollar_format()) +
geom_smooth(method = "lm", aes(group = 1)) +
expand_limits(y = 0)
ggplotly(g)
majors_processed %>%
select(Major, Total, ShareWomen, Sample_size, Median) %>%
lm(Median ~ ShareWomen, data =., weights = Sample_size) %>%
summary()
##
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -260500 -61042 -13899 33262 865081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52073 1436 36.255 <2e-16 ***
## ShareWomen -23650 2403 -9.842 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123000 on 170 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3592
## F-statistic: 96.87 on 1 and 170 DF, p-value: < 2.2e-16
majors_processed %>%
select(Major, Major_category, Total, ShareWomen, Sample_size, Median) %>%
add_count(Major_category) %>%
filter(n >= 10) %>%
nest(-Major_category) %>%
mutate(model = map(data, ~lm(Median ~ ShareWomen, data = ., weights = Sample_size)),
tidied = map(model, tidy)) %>%
unnest(tidied) %>%
filter(term == "ShareWomen") %>%
arrange(estimate) %>%
mutate(fdr = p.adjust(p.value, method = "fdr"))
## # A tibble: 9 x 7
## Major_category term estimate std.error statistic p.value fdr
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Biology & Life Scien… Share… -43735. 20982. -2.08 0.0592 0.106
## 2 Engineering Share… -33912. 15418. -2.20 0.0366 0.0937
## 3 Computers & Mathemat… Share… -28694. 18552. -1.55 0.156 0.235
## 4 Business Share… -28171. 9810. -2.87 0.0152 0.0937
## 5 Agriculture & Natura… Share… -16263. 5975. -2.72 0.0297 0.0937
## 6 Physical Sciences Share… -12820. 13349. -0.960 0.365 0.469
## 7 Education Share… -1996. 3084. -0.647 0.528 0.594
## 8 Humanities & Liberal… Share… -1814. 4128. -0.439 0.668 0.668
## 9 Health Share… 54721. 23427. 2.34 0.0416 0.0937
majors_processed %>%
filter(Sample_size >= 100) %>%
mutate(IQR = P75th - P25th) %>%
arrange(desc(IQR))
## # A tibble: 97 x 22
## Rank Major_code Major Total Men Women Major_category ShareWomen
## <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 44 5007 Phys… 32142 23080 9062 Physical Scie… 0.282
## 2 18 2400 Gene… 61152 45683 15469 Engineering 0.253
## 3 21 2102 Comp… 128319 99743 28576 Computers & M… 0.223
## 4 11 2407 Comp… 41542 33258 8284 Engineering 0.199
## 5 37 5501 Econ… 139247 89749 49498 Social Science 0.355
## 6 43 2100 Comp… 36698 27392 9306 Computers & M… 0.254
## 7 107 5901 Tran… 15150 13257 1893 Industrial Ar… 0.125
## 8 17 2412 Indu… 18968 12453 6515 Engineering 0.343
## 9 36 6207 Fina… 174506 115030 59476 Business 0.341
## 10 12 2401 Aero… 15058 12953 2105 Engineering 0.140
## # ... with 87 more rows, and 14 more variables: Sample_size <dbl>,
## # Employed <dbl>, Full_time <dbl>, Part_time <dbl>,
## # Full_time_year_round <dbl>, Unemployed <dbl>, Unemployment_rate <dbl>,
## # Median <dbl>, P25th <dbl>, P75th <dbl>, College_jobs <dbl>,
## # Non_college_jobs <dbl>, Low_wage_jobs <dbl>, IQR <dbl>
Futuer work
- Examining Unemployment and fraction taking a job requrieing a college degree
- Examining interquartile ranges
Appendix
majors_processed %>%
ggplot(aes(Sample_size, Median)) +
geom_point() +
geom_text(aes(label = Major), check_overlap = T, vjust = 1, hjust = 1) +
scale_x_log10()

knitr::knit_exit()